# Setup ----
# One-off helpers for updating R / installing dependencies (left commented out).
# (A stray lowercase `true` stood here; R only knows TRUE, so it errored.)
#if (!require(installr)) {
#  install.packages("installr"); require(installr)} #load / install+load installr
# using the package:
#updateR()
#update.packages(ask = FALSE)
#install.packages("lifecycle")
#install.packages("tidymodels")
#install.packages("rsample")
library(readr)
library(readxl)
library(tidyr)
library(tidymodels)
library(dplyr)
library(tidyverse)
library(fastDummies)
library(reshape2)
library(ggplot2)
library(caTools)
library(relaimpo)
library(MASS)    # attached after dplyr: MASS::select masks dplyr::select (qualified below)
library(plotly)  # plot_ly() is used in Part 2 but was never loaded
# Read the 2019-20 NBA player season-total stats from Excel and preview them.
player_df <- read_excel("../dataset-ignore/19-20 palyer total (1).xlsx")
head(player_df)
## # A tibble: 6 × 31
##      Rk Player Pos     Age Tm        G    GS    MP    FG   FGA `FG%`  `3P` `3PA`
##   <dbl> <chr>  <chr> <dbl> <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1     1 Steve… C        26 OKC      63    63  1680   283   478 0.592     1     3
## 2     2 Bam A… PF       22 MIA      72    72  2417   440   790 0.557     2    14
## 3     3 LaMar… C        34 SAS      53    53  1754   391   793 0.493    61   157
## 4     4 Kyle … C        23 MIA       2     0    13     1     2 0.5       0     0
## 5     5 Nicke… SG       21 NOP      47     1   591    98   266 0.368    46   133
## 6     6 Grays… SG       24 MEM      38     0   718   117   251 0.466    57   141
## # … with 18 more variables: `3P%` <dbl>, `2P` <dbl>, `2PA` <dbl>, `2P%` <dbl>,
## #   `eFG%` <dbl>, FT <dbl>, FTA <dbl>, `FT%` <dbl>, ORB <dbl>, DRB <dbl>,
## #   TRB <dbl>, AST <dbl>, STL <dbl>, BLK <dbl>, TOV <dbl>, PF <dbl>, PTS <dbl>,
## #   `Player-additional` <chr>

Data Preparation

# Keep only the predictors of interest: drop identifiers (Player, Rk, Tm,
# Player-additional) and shooting totals/percentages that are derived from,
# or redundant with, the retained columns.
play_num <- subset(player_df, select = -c(Player, Rk,Tm,`Player-additional`,GS,FG, FGA,`2P`,`3PA`,`2PA`,`2P%`, `3P%`, `eFG%`, `FG%`,FT, FTA,TRB, PTS))
# Remove rows with any missing values.
play_num <- drop_na(play_num)


# Number of remaining columns (ncol() is the idiomatic way to count columns;
# length() on a data frame happens to give the same value but reads as a bug).
ncol(play_num)
## [1] 13
# Dummy-encode player position, dropping the first level to avoid the dummy trap.
dummy <- dummy_cols(play_num, select_columns = c("Pos"), remove_first_dummy = TRUE)
model_data <- subset(dummy, select = -c(Pos))
# MASS is attached after dplyr, so select() must be qualified explicitly.
num_data <- play_num %>% dplyr::select(where(is.numeric))

Exploratory Data Analysis

# Descriptive statistics for every numeric column (Table 1).
summary(num_data)
##       Age              G               MP               3P        
##  Min.   :19.00   Min.   : 1.00   Min.   :   4.0   Min.   :  0.00  
##  1st Qu.:23.00   1st Qu.:19.00   1st Qu.: 270.5   1st Qu.:  4.25  
##  Median :25.00   Median :43.00   Median : 806.5   Median : 28.00  
##  Mean   :25.83   Mean   :40.03   Mean   : 908.8   Mean   : 46.07  
##  3rd Qu.:28.00   3rd Qu.:60.00   3rd Qu.:1477.2   3rd Qu.: 70.00  
##  Max.   :43.00   Max.   :74.00   Max.   :2556.0   Max.   :299.00  
##       FT%              ORB              DRB             AST       
##  Min.   :0.0000   Min.   :  0.00   Min.   :  0.0   Min.   :  0.0  
##  1st Qu.:0.6670   1st Qu.:  9.00   1st Qu.: 37.0   1st Qu.: 18.0  
##  Median :0.7705   Median : 24.00   Median : 96.0   Median : 52.5  
##  Mean   :0.7438   Mean   : 37.86   Mean   :130.8   Mean   : 90.2  
##  3rd Qu.:0.8360   3rd Qu.: 52.00   3rd Qu.:192.8   3rd Qu.:115.0  
##  Max.   :1.0000   Max.   :258.00   Max.   :716.0   Max.   :684.0  
##       STL              BLK              TOV               PF        
##  Min.   :  0.00   Min.   :  0.00   Min.   :  0.00   Min.   :  0.00  
##  1st Qu.:  7.25   1st Qu.:  4.00   1st Qu.: 14.00   1st Qu.: 28.25  
##  Median : 23.00   Median : 10.00   Median : 37.00   Median : 74.00  
##  Mean   : 28.93   Mean   : 18.59   Mean   : 52.12   Mean   : 78.58  
##  3rd Qu.: 45.00   3rd Qu.: 24.00   3rd Qu.: 74.75   3rd Qu.:122.00  
##  Max.   :125.00   Max.   :196.00   Max.   :308.00   Max.   :278.00

Table 1

The descriptive statistic output shows the mean age of NBA players is approximately 26 years (25.83). The oldest player is 43 years and the youngest is 19, resulting in a range of about 24 years. The maximum number of games a player has played (G) is 74 and the minimum is 1, and this variation could be related to player performance whereby a high-performing player is prioritized. The same logic applies to minutes played (MP). Our target variable is 3P, which is the number of 3-point field goals made. The mean 3P count is 46.07, the minimum is 0 and the maximum is 299. The 75th percentile is 70, implying that at least 25% of all NBA players make about 70 or more 3P field goals. This cohort may represent elite players with consistently good performance; hence, sports organizations should focus on retaining them rather than losing them to rival teams. The histogram below illustrates the distribution of 3P scoring. It is evident that the distribution of 3P scores is skewed to the right, as evidenced by most values lying on the left side of the chart. This means that most NBA players score fewer 3P field goals. However, we can discern the presence of outliers – a bar far away from the rest of the bars. This outlier represents NBA players with exceptionally high 3P scores compared to the general population of NBA players. We are more interested in the general population, and hence, it was worthwhile to remove it to avoid skewing the performance of our model.

# Histogram of the response (3-point field goals made) — Figure 1.
hist(num_data$`3P`, main = " Distribution of 3P", xlab = 'bins')

Figure 1 Distribution of 3P Scores

# Distribution of 3P totals by player position (Figure 2).
boxplot(play_num$`3P`~ play_num$Pos, 
        main = ' A Boxplot of 3 Point Scoring by Player Position',
        ylab = '3 Point Scoring (3P)', xlab = 'Player Position', col = 'yellow', border = 'brown')

Figure 2 Side-to-Side Boxplot of 3 Point Scores by Position

The boxplot above captures the distribution of 3-point scores by player position. Based on the plot, we can discern a noticeable difference in 3P scores across player positions, with SF-PF registering the highest mean 3P scores, followed at a distance by SF-SG and SG-PG. This finding supports the inclusion of player position as a predictor of the 3P score and checks whether the existing relationship is statistically significant.

# Pairwise correlations rounded to 3 dp, reshaped to long form for ggplot.
cormatrx  <- round(cor(num_data),3) 
cormat_melt <- melt(cormatrx)

# Heatmap of the correlation matrix with each coefficient printed in its tile
# (Figure 3).
corr_heatmap <- ggplot(data = cormat_melt, aes(x = Var1, y=Var2, label = value, fill = value))+
  geom_tile()
corr_heatmap+
  geom_text(aes(Var1, Var2, label = value), color = "black", size = 4)

# Fix the RNG seed so the train/test split is reproducible.
set.seed(1)

# 70/30 split; sample.split() stratifies on the response `3P`.
sample <- sample.split(model_data$`3P`, SplitRatio = 0.7)
train  <- subset(model_data, sample == TRUE)
test   <- subset(model_data, sample == FALSE)

Figure 3 Correlation Plot

The correlation plot above identifies the direction and magnitude of association between pairs of variables in our dataset. It is evident that there is a moderate positive relationship between 3P scores and games played (G, r = 0.644), 3P scores and assists per game (AST, r = 0.613), 3P scores and turnovers per game (TOV, r = 0.667), and a strong positive relationship between 3P scores and minutes played (MP, r = 0.773). We can also identify the presence of multicollinearity issues in our dataset, as evidenced by the high correlation between pairs of independent variables. For example, there is a high correlation between minutes played (MP) and personal fouls per game (PF, r = 0.897), turnovers per game (TOV) and MP (r = 0.853), and assists per game (AST) and turnovers per game (TOV, r = 0.897). A possible approach to address this issue is to remove one of the variables from each pair. We included a cut-off of 0.85 in our regression model function to eliminate one of any pair of highly correlated independent variables, preferably the one with a higher p-value.

Data Transformation

It is worthwhile to note that issues such as missing values and outliers were addressed by removing rows containing NA and extreme values. Our next step is to perform data transformation. Firstly, we performed dummy encoding on our only categorical data, player position, using the dummy_cols () function and removed the first dummy column to avoid the multicollinearity issue. Secondly, variables, which contain

large values, such as Age, STL (steals per game), MP (minutes played), and BLK (blocks per game) were normalized. Most importantly, all these transformations, including splitting data into training sets were done at once using the recipe function in R and the results were also mirrored to the testing data using the bake function.

Optimal Model: Based on the Lowest AIC Value

# Preprocessing recipe: normalize the large-valued predictors and drop one of
# any pair of numeric predictors correlated above 0.85 (multicollinearity).
rec <- recipe(
  `3P` ~.,
  data = train)%>%
  step_normalize(Age, STL, MP, BLK)%>%
  step_corr(all_numeric_predictors(), threshold = 0.85)
# Estimate the preprocessing parameters on the training data only.
log.rec.prep <- prep(rec, training = train)

# Apply the prepared recipe to the training data.
train_set <- log.rec.prep %>% 
  bake(new_data = NULL)
# Full linear model on all remaining predictors.
l.model = lm(`3P`~., data = train_set)
summary(l.model)
## 
## Call:
## lm(formula = `3P` ~ ., data = train_set)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -148.376  -14.064   -1.441   11.510  145.102 
## 
## Coefficients: (1 not defined because of singularities)
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept) -29.77606    9.13458  -3.260 0.001206 ** 
## Age           1.65163    1.48740   1.110 0.267453    
## G             0.97650    0.11234   8.692  < 2e-16 ***
## `FT%`        37.40258   10.65237   3.511 0.000494 ***
## ORB          -0.76994    0.07675 -10.032  < 2e-16 ***
## DRB           0.19292    0.03176   6.074 2.79e-09 ***
## AST           0.11406    0.02531   4.507 8.52e-06 ***
## STL           6.15142    2.84676   2.161 0.031271 *  
## BLK           1.48954    2.49903   0.596 0.551464    
## `Pos_C-PF`   16.42202   30.75384   0.534 0.593635    
## Pos_PF        3.89151    4.88875   0.796 0.426473    
## `Pos_PF-C`    6.72209   18.24347   0.368 0.712711    
## `Pos_PF-SF` -17.63716   21.96377  -0.803 0.422421    
## Pos_PG      -10.07369    6.45317  -1.561 0.119264    
## `Pos_PG-SG`  -4.40865   30.77196  -0.143 0.886147    
## Pos_SF        4.08671    5.53191   0.739 0.460469    
## `Pos_SF-C`         NA         NA      NA       NA    
## `Pos_SF-PF`  14.96257   21.99959   0.680 0.496796    
## `Pos_SF-SG`   9.59393   30.96106   0.310 0.756813    
## Pos_SG       11.67706    5.52524   2.113 0.035154 *  
## `Pos_SG-PG`  20.92139   30.75211   0.680 0.496674    
## `Pos_SG-SF`  -1.45232   30.73982  -0.047 0.962340    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 30.41 on 421 degrees of freedom
## Multiple R-squared:  0.6852, Adjusted R-squared:  0.6703 
## F-statistic: 45.83 on 20 and 421 DF,  p-value: < 2.2e-16

Figure 4 Final Model

While all predictor variables, namely Age, G, MP, STL, FT%, ORB, DRB, AST, BLK, TOV, PF, and Pos (14 dummy variables), were in the full model, stepAIC retained the 8-predictor subset with the lowest AIC value. The selected variables are G, FT%, ORB, DRB, AST, STL, Pos_PG and Pos_SG. The coefficient on G is 0.9795, implying that a unit increase in the number of games played increases 3-point makes by about 0.98, approximately 1, presuming all other factors are held constant. The coefficient on Pos_SG (SG = 1, not SG = 0) is 7.996, implying that NBA players in the SG (shooting guard) position make about 8 more 3P scores than players in other positions, when other factors are also held constant. In contrast, NBA players in PG positions make 13.60 (approximately 14) fewer 3P scores

compared to players in other positions. The global F test informs us to reject the null hypothesis that all coefficients are equal to zero, because there is sufficient evidence to indicate that at least one covariate is statistically different from zero, F(8, 433) = 116.2, p-value < 2.2e-16 < 0.05. Lastly, the adjusted R-squared is 0.6763, implying that this final model explains about 67.63% of the variation in 3-P scores across NBA players. The resulting RMSE from prediction on test data is 30.58.

# Apply the same (training-estimated) preprocessing to the test data; preview.
testing_set <- bake(log.rec.prep, test)
test[1:10, names(testing_set)]
## # A tibble: 10 × 22
##      Age     G `FT%`   ORB   DRB   AST   STL   BLK `Pos_C-PF` Pos_PF `Pos_PF-C`
##    <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>      <int>  <int>      <int>
##  1    27    10 0.636     2     7    21     5     2          0      0          0
##  2    29    18 0.655    24    63    21    18     8          0      1          0
##  3    26    10 0.5       1    20     8     0     6          0      0          0
##  4    22     5 0.5       2     1     2     0     0          0      1          0
##  5    22    69 0.706    80   284   108    96    45          0      0          0
##  6    34    53 0.838    34   212    91    69    15          0      0          0
##  7    34    32 0.778    21   125    50    35     7          0      0          0
##  8    19    56 0.614    50   229   143    55    17          0      0          0
##  9    29    58 0.767    76   289   212    61    29          0      0          0
## 10    31    22 0.9      25    75    66    17     8          0      0          0
## # … with 11 more variables: `Pos_PF-SF` <int>, Pos_PG <int>, `Pos_PG-SG` <int>,
## #   Pos_SF <int>, `Pos_SF-C` <int>, `Pos_SF-PF` <int>, `Pos_SF-SG` <int>,
## #   Pos_SG <int>, `Pos_SG-PG` <int>, `Pos_SG-SF` <int>, `3P` <dbl>
# Predict on the processed test set with the full (rank-deficient) model.
fitted = predict(l.model, newdata = testing_set)
## Warning in predict.lm(l.model, newdata = testing_set): prediction from a rank-
## deficient fit may be misleading
actuals = as.numeric(testing_set$`3P`)

# Test-set RMSE of the full model.
sqrt(mean((actuals - fitted)^2))
## [1] 30.18264
# Stepwise selection by AIC starting from the full model (MASS::stepAIC).
l.model.fin <- lm(`3P` ~., data = train_set) %>%
  stepAIC(trace = FALSE)

# Coefficients of the AIC-selected final model (Figure 4).
summary(l.model.fin)
## 
## Call:
## lm(formula = `3P` ~ G + `FT%` + ORB + DRB + AST + STL + Pos_PG + 
##     Pos_SG, data = train_set)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -147.628  -13.627   -1.535   11.659  144.382 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept) -28.42290    8.41698  -3.377 0.000799 ***
## G             0.97954    0.11033   8.878  < 2e-16 ***
## `FT%`        38.85695   10.36809   3.748 0.000203 ***
## ORB          -0.78189    0.06655 -11.749  < 2e-16 ***
## DRB           0.20192    0.03053   6.614 1.11e-10 ***
## AST           0.11160    0.02425   4.603 5.49e-06 ***
## STL           6.72707    2.67047   2.519 0.012126 *  
## Pos_PG      -13.60417    5.12053  -2.657 0.008180 ** 
## Pos_SG        7.99634    4.00340   1.997 0.046408 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 30.13 on 433 degrees of freedom
## Multiple R-squared:  0.6822, Adjusted R-squared:  0.6763 
## F-statistic: 116.2 on 8 and 433 DF,  p-value: < 2.2e-16
# Re-bake the test data (identical to the earlier call) and preview it.
testing_set <- bake(log.rec.prep, test)
test[1:10, names(testing_set)]
## # A tibble: 10 × 22
##      Age     G `FT%`   ORB   DRB   AST   STL   BLK `Pos_C-PF` Pos_PF `Pos_PF-C`
##    <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>      <int>  <int>      <int>
##  1    27    10 0.636     2     7    21     5     2          0      0          0
##  2    29    18 0.655    24    63    21    18     8          0      1          0
##  3    26    10 0.5       1    20     8     0     6          0      0          0
##  4    22     5 0.5       2     1     2     0     0          0      1          0
##  5    22    69 0.706    80   284   108    96    45          0      0          0
##  6    34    53 0.838    34   212    91    69    15          0      0          0
##  7    34    32 0.778    21   125    50    35     7          0      0          0
##  8    19    56 0.614    50   229   143    55    17          0      0          0
##  9    29    58 0.767    76   289   212    61    29          0      0          0
## 10    31    22 0.9      25    75    66    17     8          0      0          0
## # … with 11 more variables: `Pos_PF-SF` <int>, Pos_PG <int>, `Pos_PG-SG` <int>,
## #   Pos_SF <int>, `Pos_SF-C` <int>, `Pos_SF-PF` <int>, `Pos_SF-SG` <int>,
## #   Pos_SG <int>, `Pos_SG-PG` <int>, `Pos_SG-SF` <int>, `3P` <dbl>
# Test-set RMSE of the AIC-selected model.
fitted = predict(l.model.fin, newdata = testing_set)
actuals = as.numeric(testing_set$`3P`)

sqrt(mean((actuals - fitted)^2))
## [1] 30.58373
# Observed vs. predicted values with a 45-degree reference line.
# The original mixed base plot() with ggplot2's geom_point() — the `+` on a
# base plot is meaningless (it printed "integer(0)") — and plotted `fitted`
# against its index even though the axis labels promised observed-vs-predicted.
plot(fitted, actuals,
     xlab = "Predicted Values",
     ylab = "Observed Values")
abline(a = 0,      # intercept 0, slope 1: the perfect-prediction line
       b = 1,
       col = "red",
       lwd = 2)

# Relative importance (lmg metric, normalized) of the final-model predictors,
# with bootstrap confidence intervals (Figure 5).
rel_imp <- calc.relimp(l.model.fin, type = c("lmg"), rela = TRUE)
boo_rslts <- boot.relimp(l.model.fin, b = 1000)
ci <- booteval.relimp(boo_rslts, norank = TRUE)
plot(ci)

Figure 5 Relative Importance

Based on the bar chart, games played (G) are ranked as the important predictor of 3-P scores, followed closely by AST (number of assists), then STL (steals per game), (DRB) defensive, and (ORB) offensive rebounds. Scouts should pay more attention to these characteristics when scouting for NBA players to play in the professional league as it will guarantee positive returns on their investments.


Analysis Part 2

R Markdown

#load the Part 2 dataset (per-game NBA stats) into R
nba_data <- read_csv("../dataset-ignore/NBA Stats.csv")
head(nba_data) #glimpse of the data
## # A tibble: 6 × 34
##    ...1 Player Ratings Team    Age Height Weight College Country Draft…¹ Draft…²
##   <dbl> <chr>    <dbl> <chr> <dbl> <chr>   <dbl> <chr>   <chr>   <chr>   <chr>  
## 1     0 Kent …      76 ATL      29 6-5       201 Old     Domini… USA     Undraf…
## 2     1 Deway…      77 ATL      29 7-0       245 Southe… Califo… USA     Undraf…
## 3     2 John …      84 ATL      21 6-10      235 Wake    Forest  USA     2017   
## 4     3 Trae …      84 ATL      20 6-2       180 Oklaho… USA     2018    1      
## 5     4 Kevin…      74 ATL      20 6-7       190 Maryla… USA     2018    1      
## 6     5 Vince…      73 ATL      42 6-6       220 North   Caroli… USA     1998   
## # … with 23 more variables: `Draft Number` <chr>, Shoes <chr>, `2018/19` <chr>,
## #   gp <dbl>, min <dbl>, pts <dbl>, fgm <dbl>, fga <dbl>, `fg%` <dbl>,
## #   `3pm` <dbl>, `3pa` <dbl>, `3p%` <dbl>, ftm <dbl>, fta <dbl>, `ft%` <dbl>,
## #   oreb <dbl>, dreb <dbl>, reb <dbl>, ast <dbl>, stl <dbl>, blk <dbl>,
## #   tov <dbl>, eff <dbl>, and abbreviated variable names ¹​`Draft Year`,
## #   ²​`Draft Round`

#Data Preprocessing and Preparation

#filter to select players who made at least 1.5 3s per game
nba_data <- nba_data %>% 
  filter(`3pm` >= 1.5)
# The `2018/19` column holds salary as a formatted string, e.g. "$12,345,678".
nba_data$Salary = nba_data$`2018/19`

#remove the $ sign and thousands separators, then convert to numeric
nba_data$Salary = as.numeric(gsub("[\\$,]", "", nba_data$Salary ))

#convert weight from pounds to kgs
nba_data$Weight = nba_data$Weight* 0.453592
#player height is a "feet-inches" string; convert it to total inches
#create a function to do so:
# Convert "F-I" height strings (e.g. "6-5") to total inches: feet are the
# digits before the dash, inches the digits after it. Vectorized over x.
playerHeight <- function(x) {
  feet <- as.numeric(sub("-.*", "", x))
  inches <- as.numeric(sub(".*-", "", x))
  feet * 12 + inches
}
#apply the function to the height data values (Height becomes inches)
nba_data$Height <- playerHeight(nba_data$Height)

#select relevant variables into 1 data frame for modeling
nba_df = subset(nba_data, select = c(Age,gp,min,ast,oreb,dreb,fta, stl,Shoes,blk,tov,Salary,Height, Weight,`3pm`))

#Exploratory Data Analysis (EDA)

#descriptive stats for the filtered player pool
summary(nba_df)
##       Age              gp             min             ast        
##  Min.   :19.00   Min.   :58.00   Min.   :17.20   Min.   : 0.500  
##  1st Qu.:24.00   1st Qu.:69.00   1st Qu.:27.30   1st Qu.: 1.800  
##  Median :27.00   Median :76.00   Median :30.90   Median : 2.800  
##  Mean   :27.04   Mean   :74.43   Mean   :29.66   Mean   : 3.464  
##  3rd Qu.:30.00   3rd Qu.:80.00   3rd Qu.:33.70   3rd Qu.: 4.600  
##  Max.   :42.00   Max.   :82.00   Max.   :36.90   Max.   :10.700  
##       oreb             dreb           fta              stl        
##  Min.   :0.1000   Min.   :1.30   Min.   : 0.600   Min.   :0.4000  
##  1st Qu.:0.4000   1st Qu.:2.50   1st Qu.: 1.400   1st Qu.:0.6000  
##  Median :0.7000   Median :3.50   Median : 2.400   Median :0.9000  
##  Mean   :0.7358   Mean   :3.69   Mean   : 3.105   Mean   :0.9383  
##  3rd Qu.:0.9000   3rd Qu.:4.30   3rd Qu.: 4.000   3rd Qu.:1.1000  
##  Max.   :3.4000   Max.   :9.60   Max.   :11.000   Max.   :2.2000  
##     Shoes                blk             tov            Salary        
##  Length:81          Min.   :0.000   Min.   :0.300   Min.   : 1378242  
##  Class :character   1st Qu.:0.200   1st Qu.:1.000   1st Qu.: 3447480  
##  Mode  :character   Median :0.300   Median :1.500   Median : 9367200  
##                     Mean   :0.384   Mean   :1.802   Mean   :12429394  
##                     3rd Qu.:0.400   3rd Qu.:2.300   3rd Qu.:19360228  
##                     Max.   :2.200   Max.   :5.000   Max.   :37457154  
##      Height          Weight            3pm      
##  Min.   :72.00   Min.   : 79.38   Min.   :1.50  
##  1st Qu.:76.00   1st Qu.: 90.72   1st Qu.:1.70  
##  Median :78.00   Median : 95.25   Median :1.90  
##  Mean   :77.77   Mean   : 95.23   Mean   :2.11  
##  3rd Qu.:80.00   3rd Qu.: 99.79   3rd Qu.:2.30  
##  Max.   :84.00   Max.   :122.47   Max.   :5.10

The descriptive statistic output shows the mean age of NBA players is approximately 27 years (27.04). The oldest player is 42 years and the youngest is 19, resulting in a range of about 23 years. The mean height is 77.77 inches (approximately 6.48 ft). The shortest NBA player is 6 ft tall (72 inches) while the tallest is 7 ft tall (84 inches), translating to a range of 12 inches. With regard to player performance, the maximum number of games played (gp) is 82 and the minimum is 58, and this variation could be related to player performance whereby a high-performing player is highly prioritized in terms of games allocation. The same logic applies to minutes played (min). Our target variable is 3pm, which is the number of 3-point scores made per game. The mean is 2.11, the minimum is 1.5 and the maximum number of 3-point scores made per game by any one player is 5.10. The 75th percentile is 2.30, implying that at least 25% of all NBA players made more than 2.3 such scores per game. This cohort may represent elite players with consistently good performance; hence, sports organizations should focus on retaining them rather than losing them to rival teams.

#histogram of the response (3-pointers made per game)
ggplot(nba_df, aes(x =`3pm`))+
  geom_histogram(color = 'darkblue', fill ='lightblue')+
  ggtitle('Distribution of 3PM')+
  theme(plot.title = element_text(hjust = 0.5))
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

The histogram above illustrates the distribution of 3-p scores made (denoted as 3pm). It is evident that distribution of 3P scores is skewed to the right, as evidenced by most values that lie on the left side of the chart. This means that most NBA players score fewer 3P field goals. However, we can discern the presence of outliers – a bar far away from the rest of other bars. This outlier represents NBA players with exceptionally high 3P scores compared to the general population of NBA players. We are more interested in the general population, and hence, it was worthwhile to remove the outliers to avoid skewing the performance of our model.

#box plot of 3pm by shoe brand (Figure 2, Part 2)

ggplot(nba_data, aes(x = Shoes, y=`3pm`, fill=Shoes))+
  geom_boxplot(outlier.colour = 'red', notch = FALSE)+
  ggtitle('Box Plot of 3-Point Scores Made by Shoes Worn')+
  theme(plot.title = element_text(hjust = 0.5))

Figure 2

Figure 2 is a side-to-side box plot of the dependent variable, 3pm, by shoe brand. There appears to be an association between shoe brand and player scoring, as evidenced by the variation in mean 3P scores. Players wearing Under Armour made the highest 3P scores, followed at a distance by Jordan, Adidas, Nike, Puma, and so on. This statistical finding supports the inclusion of shoe brand as a predictor of the 3P score and checks whether the existing relationship is statistically significant.

# Mean 3PM by college and by country.
col_df <- nba_data %>% group_by(College)%>% summarize(Mean_3Pm= mean(`3pm`))
con_df <-nba_data %>% group_by(Country)%>% summarize(Mean_3Pm= mean(`3pm`))

# Bar charts of the group means. The original passed `text = text`, which
# references the undefined object `text` (it resolves to base R's text()
# function) and errors; label each bar with its rounded mean instead.
plot_ly(col_df, x = ~College, y = ~Mean_3Pm, type = 'bar', text = ~round(Mean_3Pm, 2))
plot_ly(con_df, x = ~Country, y = ~Mean_3Pm, type = 'bar', text = ~round(Mean_3Pm, 2))
#correlation analysis (numeric columns only; Shoes is categorical)
cor_df <- subset(nba_df, select =-c(Shoes))
cor.mat  <- round(cor(cor_df),3) #correlation matrix
cormat_melt <- melt(cor.mat)

# Heatmap with each coefficient printed in its tile (Figure 5).
corr_heatmap <- ggplot(data = cormat_melt, aes(x = Var1, y=Var2, label = value, fill = value))+
  geom_tile()
corr_heatmap+
  geom_text(aes(Var1, Var2, label = value), color = "black", size = 4)

Figure 5 Correlation Matrix

The correlation plot above identifies the direction and magnitude of association between pairs of variables in our dataset. It is evident that there is a positive relationship between 3P scores and assists (ast, r = 0.262), 3P scores and minutes played (min, r = 0.41), 3P scores and steals (stl, r = 0.349), and 3P scores and age (r = 0.221). It is interesting that the correlation between player height and 3P scores is negative (r = -0.088). However, further analysis indicates that this correlation coefficient is not statistically different from zero; hence, height and 3P scores are unrelated. We can also identify the presence of multicollinearity issues in our dataset, as evidenced by the high correlation between some pairs of independent variables. For example, there is a high correlation between assists (ast) and turnovers (tov, r = 0.858). A possible approach to address this issue is to remove one of the variables from each pair. We included a cut-off of 0.85 in our regression model function to eliminate one of any pair of highly correlated independent variables, preferably the one with a higher p-value.

#Interactive Charts

# Scatter: 3pm vs height. Trace type and mode are declared explicitly so
# plotly does not have to guess (this silences the "No trace type specified"
# console messages). NOTE(review): the title says "3pt Percentage" but the
# y-axis is 3pm (3-pointers made) — kept as-is; confirm intended wording.
fig <- plot_ly(data = nba_df, x = ~Height, y = ~`3pm`,
               type = 'scatter', mode = 'markers')
fig <- fig %>% layout(title = 'A Scatter Plot of 3pt Percentage against Height')
fig

# Scatter: 3pa vs 3pm. `title` is a layout attribute, not a trace attribute;
# the original passed title = "S" inside plot_ly(), which plotly warned about
# and ignored. It belongs only in layout().
fig.2 <- plot_ly(data = nba_data, x = ~`3pm`, y = ~`3pa`,
                 type = 'scatter', mode = 'markers')
fig.2 <- fig.2 %>% layout(title = 'A Scatter Plot of 3pt Percentage against 3pt Made')
fig.2

# Scatter: efficiency vs 3pm. The original assigned `fig %>% layout(...)`
# (the height figure) to fig.3, so the efficiency plot was never displayed.
fig.3 <- plot_ly(data = nba_data, x = ~eff, y = ~`3pm`,
                 type = 'scatter', mode = 'markers')
fig.3 <- fig.3 %>% layout(title = 'A Scatter Plot of 3pt Percentage against Efficiency')
fig.3

Figure 4 Scatterplot (3pm vs Height)

We cannot discern any relationship between 3pm and height, as the points appear random, with no visual pattern to indicate association between player’s height and 3-points scores made.

# Scatter: 3pm vs age. The original piped into layout() without assigning the
# result, so the title was attached only to the transiently printed figure and
# the second print of `fg` was untitled. Assign the layout back and print once.
fg <- plot_ly(data = nba_df, x = ~Age, y = ~`3pm`,
              type = 'scatter', mode = 'markers')
fg <- fg %>% layout(title = 'A Scatter Plot of 3pt Percentage against Player Age')
fg

Figure 3 Scatter plot (3pm vs Age)

The scatterplot above visualizes the relationship between player score performance (3p scores per game) by age. There appears to be a positive relationship between age and 3-points scoring. A player is at prime in mid 20s and early 30s, and start to decline as he approaches 35 years.

#Full Model

# Reproducible split for Part 2.
set.seed(123)
#define 3pm as the response y
nba_df$y = nba_df$`3pm`
#remove the original 3pm column so it does not duplicate the response
nba_df = subset(nba_df, select = -(`3pm`))

#use 80% of dataset as training set and 20% as test set (SplitRatio = 0.8)
sample <- sample.split(nba_df$y, SplitRatio = 0.8)
train  <- subset(nba_df, sample == TRUE)
test   <- subset(nba_df, sample == FALSE)

# Preprocessing: normalize large-valued predictors, drop one of any numeric
# pair correlated above 0.85, and dummy-encode the Shoes factor.
rec <- recipe(
  y ~.,
  data = train)%>%
  step_normalize(Salary, Age, gp, min, Height, Weight)%>%
  step_corr(all_numeric_predictors(), threshold = 0.85)%>%
  step_dummy(Shoes)

log.rec.prep <- prep(rec, training = train) #estimate the recipe on training data

train_set <- log.rec.prep %>% #apply the prepared recipe to the training data
  bake(new_data = NULL)
# Full linear model with all remaining predictors.
full.model = lm(y~., data = train_set)
summary(full.model)
## 
## Call:
## lm(formula = y ~ ., data = train_set)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -1.34508 -0.31909  0.00493  0.21422  1.34508 
## 
## Coefficients:
##                    Estimate Std. Error t value Pr(>|t|)    
## (Intercept)         1.86389    0.38559   4.834  1.6e-05 ***
## Age                 0.25070    0.10389   2.413  0.01995 *  
## gp                  0.08763    0.07585   1.155  0.25409    
## min                 0.30377    0.13823   2.198  0.03316 *  
## ast                -0.13182    0.06877  -1.917  0.06164 .  
## oreb               -0.15377    0.21977  -0.700  0.48771    
## dreb               -0.05807    0.09053  -0.641  0.52450    
## fta                 0.18309    0.06245   2.932  0.00528 ** 
## stl                 0.48918    0.25752   1.900  0.06391 .  
## blk                -0.02152    0.25765  -0.084  0.93380    
## Salary             -0.10557    0.13114  -0.805  0.42504    
## Height             -0.15203    0.13879  -1.095  0.27917    
## Weight              0.05863    0.12728   0.461  0.64727    
## Shoes_Anta          0.60839    0.61274   0.993  0.32606    
## Shoes_Jordan       -0.17371    0.28513  -0.609  0.54542    
## Shoes_K8IROS       -0.34275    0.61660  -0.556  0.58106    
## Shoes_New.Balance  -1.10735    0.66754  -1.659  0.10410    
## Shoes_Nike          0.04406    0.19222   0.229  0.81974    
## Shoes_Puma          0.01156    0.45995   0.025  0.98005    
## Shoes_Q4           -0.08735    0.62089  -0.141  0.88875    
## Shoes_Under.Armour  1.29128    0.46922   2.752  0.00851 ** 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.5519 on 45 degrees of freedom
## Multiple R-squared:  0.5765, Adjusted R-squared:  0.3883 
## F-statistic: 3.063 on 20 and 45 DF,  p-value: 0.0009186

#Model Selection

Optimal Model: Based on the Lowest AIC Value

Figure 6 shows the final model containing the best subset of variables for predicting the offensive performance of an NBA player, measured by 3-point scores made (3pm). The variables in the final model were selected on the basis of the lowest AIC value. We chose AIC as our model selection criterion because it selects the model that best fits the data and it is moderately tolerant compared to BIC (Yang & Berdine, 2015). The selected variables are age, games played (gp), minutes played (min), assists (ast), offensive rebounds (oreb), free throw attempts (fta), steals (stl), height, and shoes. The coefficients on age, ast, fta, and shoes (Under Armour) are statistically significant at the 0.01 level because their corresponding p-values are less than 0.01, while min and stl are significant at the 0.05 level (see Figure 6). To illustrate a few variables, the coefficient on age is 0.1976, implying that an increase in player age by one year is associated with an increase in 3-point scores of about 0.2 (on the transformed scale), presuming other factors are held constant. Furthermore, an NBA player wearing Under Armour during play is likely to make more 3-point scores compared to a player wearing other brands, such as Nike or Adidas. The resulting equation for predicting player performance is: Y_i = 1.9627 + 0.1976X_1 + 0.0905X_2 + 0.2939X_3 − 0.1654X_4 − 0.2539X_5 + 0.1515X_6 + 0.4740X_7 − 0.1475X_8 − 1.1023X_9 + 1.1387X_10, where X_1 denotes age, X_2 is gp, X_3 is min, X_4 is ast, X_5 is oreb, X_6 is fta, X_7 is stl, X_8 is height, X_9 is Shoes_New.Balance, and X_10 is Shoes_Under.Armour. The global F test leads us to reject the null hypothesis that all coefficients are equal to zero, because there is sufficient evidence that at least one covariate is statistically different from zero, F(10, 55) = 6.503, p-value = 1.566e-06 < 0.05. Lastly, the adjusted R-squared is 0.4585, implying that, after adjusting for the number of predictors, the final model explains roughly 46% of the variation in 3-point scores across NBA players.

Predictive Performance on Testing Set

Figure 7 is a scatter plot of actual vs fitted values of 3-point scores made. It is evident that most values lie close to the regression line and are within the 95% confidence region. However, some of the values were either under-predicted or over-predicted, as evidenced by the points lying far away from the regression line and outside the grey region. The calculated RMSE is 0.4868, which is relatively close to 0; hence, the model did well in terms of predicting player performance.

#ols_step_best_subset(full.model)

#final model

# Backward stepwise selection: starting from the full model, drop
# predictors one at a time until AIC stops improving.
# trace = FALSE suppresses the step-by-step console output.
set.seed(123)
final.model <- stepAIC(full.model, direction = "backward", trace = FALSE)
reg.summary <- summary(final.model)
reg.summary
## 
## Call:
## lm(formula = y ~ Age + gp + min + ast + oreb + fta + stl + Height + 
##     Shoes_New.Balance + Shoes_Under.Armour, data = train_set)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -1.2561 -0.2950 -0.0173  0.2812  1.2561 
## 
## Coefficients:
##                    Estimate Std. Error t value Pr(>|t|)    
## (Intercept)         1.96274    0.25620   7.661 3.12e-10 ***
## Age                 0.19760    0.07149   2.764  0.00775 ** 
## gp                  0.09051    0.06932   1.306  0.19709    
## min                 0.29394    0.11231   2.617  0.01142 *  
## ast                -0.16537    0.05823  -2.840  0.00632 ** 
## oreb               -0.25388    0.16733  -1.517  0.13494    
## fta                 0.15153    0.05017   3.020  0.00382 ** 
## stl                 0.47401    0.23129   2.049  0.04520 *  
## Height             -0.14752    0.09039  -1.632  0.10839    
## Shoes_New.Balance  -1.10227    0.61421  -1.795  0.07821 .  
## Shoes_Under.Armour  1.13869    0.39420   2.889  0.00552 ** 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.5193 on 55 degrees of freedom
## Multiple R-squared:  0.5418, Adjusted R-squared:  0.4585 
## F-statistic: 6.503 on 10 and 55 DF,  p-value: 1.566e-06
 #make predictions on testing dataset using the full model
testing_set <- bake(log.rec.prep, test)
testing_set$fitted = predict(final.model, newdata = testing_set)

# Fitted-vs-actual scatter plot on the test set.
# NOTE: aes(x = y, y = fitted) maps the ACTUAL response to the x-axis
# and the model's predictions to the y-axis, so the axis labels must
# match that mapping (the original had them swapped).
ggplot(testing_set, aes(x = y, y = fitted)) +
  geom_point() +
  geom_smooth(method = 'lm') +
  ggtitle('fitted vs actual plot') +
  ylab('Fitted') +
  xlab('Actual') +
  theme(plot.title = element_text(hjust = 0.5))
## `geom_smooth()` using formula 'y ~ x'

Figure 7

# Root mean squared error of the stepwise model on the test set
rmse <- sqrt(mean((testing_set$y - testing_set$fitted)^2))
rmse
## [1] 0.4868246
# Decompose the model R^2 into per-predictor contributions with the
# LMG metric; rela = TRUE normalizes the shares to sum to 100%.
rel_imp <- calc.relimp(final.model, type = "lmg", rela = TRUE)
rel_imp
## Response variable: y 
## Total response variance: 0.4979138 
## Analysis based on 66 observations 
## 
## 10 Regressors: 
## Age gp min ast oreb fta stl Height Shoes_New.Balance Shoes_Under.Armour 
## Proportion of variance explained by model: 54.18%
## Metrics are normalized to sum to 100% (rela=TRUE). 
## 
## Relative importance metrics: 
## 
##                           lmg
## Age                0.13630060
## gp                 0.03241851
## min                0.17495645
## ast                0.07470439
## oreb               0.04049944
## fta                0.19755163
## stl                0.12670609
## Height             0.02971409
## Shoes_New.Balance  0.02857386
## Shoes_Under.Armour 0.15857495
## 
## Average coefficients for different model sizes: 
## 
##                             1X         2Xs         3Xs          4Xs         5Xs
## Age                 0.18843249  0.19682366  0.20119811  0.202749585  0.20255063
## gp                  0.07341793  0.08303488  0.09118498  0.097550228  0.10198134
## min                 0.28117460  0.27301450  0.26944267  0.269365898  0.27186635
## ast                 0.08358611  0.05570702  0.02677829 -0.002545732 -0.03183065
## oreb               -0.03286856 -0.11077718 -0.17132014 -0.216486952 -0.24813737
## fta                 0.13771387  0.13554094  0.13422713  0.133845041  0.13449459
## stl                 0.65297450  0.60607408  0.56231969  0.525101858  0.49662065
## Height             -0.07589576 -0.06823530 -0.06386090 -0.063807021 -0.06848615
## Shoes_New.Balance  -0.25692308 -0.39081974 -0.49618622 -0.581271368 -0.65560592
## Shoes_Under.Armour  1.18281250  1.21881034  1.24382312  1.259549900  1.26638118
##                            6Xs         7Xs        8Xs         9Xs        10Xs
## Age                 0.20149636  0.20021763  0.1990540  0.19813601  0.19760080
## gp                  0.10436893  0.10455909  0.1023667  0.09767239  0.09051081
## min                 0.27609837  0.28122254  0.2863954  0.29083274  0.29393900
## ast                -0.06074542 -0.08896621 -0.1161155 -0.14174425 -0.16536503
## oreb               -0.26791949 -0.27727600 -0.2774498 -0.26944217 -0.25387682
## fta                 0.13622786  0.13900043  0.1426556  0.14693799  0.15152709
## stl                 0.47772894  0.46797791  0.4658055  0.46880077  0.47401157
## Height             -0.07781629 -0.09129638 -0.1080872 -0.12715290 -0.14751522
## Shoes_New.Balance  -0.72835035 -0.80682158 -0.8953217 -0.99458905 -1.10226842
## Shoes_Under.Armour  1.26370269  1.25037916  1.2253556  1.18812428  1.13869117

Relative Importance: Based on the output above, free throws attempted (fta) and minutes played (min) are ranked as the most important predictors of 3-point scores, followed closely by Shoes (Under Armour), then age and steals (stl). Scouts should pay more attention to these metrics when scouting for NBA players to play in the professional league, as doing so is more likely to yield positive returns on their investments.


Rubric: On this page

you will

  • Introduce what motivates your Data Analysis (DA)
    • Which variables and relationships are you most interested in?
    • What questions are you interested in answering?
  • Breadth of the DA
    • Make sure that you ask enough initial questions to explore the different variables in your data.
    • i.e. Do you explore more than just one or two variables? Do you explore a few different relationships or many?
  • Depth of the DA
    • When you answer one question, usually more questions arise as well.
    • The depth of the DA is about coming up with and exploring the answers to these questions, often iterating the process a few times.
  • Modeling and Inference
    • You should also include some kind of formal statistical model and/or inference. This could be a linear regression, logistic regression, hypothesis testing etc.
    • Explain the techniques you used for validating your results.
    • Describe the results of your modelling and make sure to give a sense of the uncertainty in your estimates and conclusions.
  • Explain the flaws and limitations of your analysis
    • Are there some assumptions that you needed to make that might not hold? Is there other data that would help to answer your questions? …
  • Clarity of Figures
    • Are your figures/tables/results easy to read, informative, without problems like overplotting, hard-to-read labels, etc?
    • Each figure should provide a key insight. Too many figures or other data summaries can detract from this.
  • Clarity of Explanations
    • Do you introduce why you are doing each analysis?
    • How well do you explain each figure/result?
    • Do you provide interpretations that suggest further analysis or explanations for observed phenomenon?
  • Organization and cleanliness.
    • Make sure to remove excessive warnings, use clean easy-to-read code, organize with sections or multiple pages, use bullets, etc.

NOTE: Your Data Analysis can be broken up into multiple pages if that helps with your organization.